library(tidyverse)
library(lubridate)
library(leaflet)
incidents <- feather::read_feather("incidents.feather")
drivers <- feather::read_feather("drivers.feather")
non_motorists <- feather::read_feather("non_motorists.feather")
single_color <- "#3182bd"
basic_theme <- theme(
plot.title = element_text(size = 24, vjust = .5, hjust = .5),
axis.title = element_text(size = 16, hjust = .5, vjust = .5),
panel.background = element_rect(fill = "white"),
legend.position = "none",
panel.grid.major.y = element_line(color = "grey75", size = .1, linetype = "solid")
)
This is a quick overview of the structure of the Incidents table.
skimr::skim_to_wide(incidents) %>% DT::datatable()
## Warning: 'skimr::skim_to_wide' is deprecated.
## Use 'skim()' instead.
## See help("Deprecated")
There have been 146 fatal crashes out of 58131 recorded incidents since this data set began in 2015.
incidents %>%
group_by(acrs_report_type) %>%
count() %>%
knitr::kable()
| acrs_report_type | n |
|---|---|
| Fatal Crash | 146 |
| Injury Crash | 20787 |
| Property Damage Crash | 37198 |
This interactive map shows the locations of the incidents in the data set. The 112 incidents that do not fall within the limits of the county are colored red.
pal <- colorFactor(c("blue", "red"), domain = c(0, 1))
incidents %>%
leaflet() %>%
addTiles() %>%
addCircleMarkers(lng = ~longitude, lat = ~latitude, radius = 3, stroke = TRUE, weight = 2, opacity = 1, color = ~ pal(not_in_county))
Crashes peak around 5:00 PM, which lines up with the evening rush hour. It seems that the report times are often rounded to the nearest fifteen or five minute round number.
incidents %>%
select(incident_hour, incident_minute) %>%
mutate(incident_time = 60 * incident_hour + incident_minute) %>%
ggplot() +
stat_count(aes(x = incident_time), fill = single_color) +
scale_x_continuous(
name = "Time",
breaks = seq.int(from = 0, to = 24 * 60, by = 120),
labels = c(paste0(seq.int(from = 0, to = 22, by = 2), ":00"), "0:00")
) +
ggtitle("Crashes by Time of Day") +
basic_theme
Crashes are slightly less frequent on the weekends than on weekdays.
incidents %>%
select(incident_weekday) %>%
ggplot() +
stat_count(aes(x = incident_weekday), fill = single_color) +
scale_x_continuous(
name = "Day of Week",
breaks = 1:7,
labels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thurday", "Friday", "Saturday")
) +
ggtitle("Crashes by Day of Week") +
basic_theme
The number of crashes seems to be fairly constant over time.
incidents %>%
select(incident_year) %>%
ggplot() +
stat_count(aes(x = incident_year), fill = single_color) +
ggtitle("Crashes by Year") +
basic_theme
This table contains the number of crashes recorded on each date. 2019 is excluded because the year hasn’t concluded and the counts for dates that already have happened would be inflated. The dates with the most crashes tend to come from the last four months of the year, but holidays including Christmas and New Years are among the days with the fewest crashes.
incidents %>%
filter(incident_year < 2019) %>%
select(incident_month, incident_day) %>%
mutate(incident_date = paste0(incident_month, "/", incident_day)) %>%
count(incident_date) %>%
arrange(desc(n)) %>%
rename(Date = incident_date, `Number of Crashes` = n) %>%
DT::datatable()
Most crashes come under clear weather conditions.
incidents %>%
group_by(weather) %>%
count() %>%
arrange(desc(n)) %>%
knitr::kable()
| weather | n |
|---|---|
| CLEAR | 38224 |
| RAINING | 7127 |
| CLOUDY | 6103 |
| N/A | 4894 |
| SNOW | 676 |
| UNKNOWN | 361 |
| FOGGY | 233 |
| WINTRY MIX | 171 |
| OTHER | 140 |
| SLEET | 77 |
| SEVERE WINDS | 64 |
| BLOWING SNOW | 54 |
| BLOWING SAND SOIL DIRT | 7 |
Crashes involving alcohol and illegal drugs constitute a small minority of the total incidents.
incidents %>%
mutate(alcohol = driver_substance_abuse_alcohol_present | driver_substance_abuse_alcohol_contributed) %>%
count(alcohol) %>%
knitr::kable()
| alcohol | n |
|---|---|
| FALSE | 54857 |
| TRUE | 3274 |
incidents %>%
count(driver_substance_abuse_alcohol_contributed) %>%
knitr::kable()
| driver_substance_abuse_alcohol_contributed | n |
|---|---|
| 0 | 57250 |
| 1 | 881 |
incidents %>%
count(driver_substance_abuse_alcohol_present) %>%
knitr::kable()
| driver_substance_abuse_alcohol_present | n |
|---|---|
| 0 | 55735 |
| 1 | 2396 |
incidents %>%
mutate(illegal_drug = driver_substance_abuse_illegal_drug_present | driver_substance_abuse_illegal_drug_contributed) %>%
count(illegal_drug) %>%
knitr::kable()
| illegal_drug | n |
|---|---|
| FALSE | 57909 |
| TRUE | 222 |
incidents %>%
count(driver_substance_abuse_illegal_drug_contributed) %>%
knitr::kable()
| driver_substance_abuse_illegal_drug_contributed | n |
|---|---|
| 0 | 58084 |
| 1 | 47 |
incidents %>%
count(driver_substance_abuse_illegal_drug_present) %>%
knitr::kable()
| driver_substance_abuse_illegal_drug_present | n |
|---|---|
| 0 | 57956 |
| 1 | 175 |
Crashes involving alcohol do not follow the general time of day trends of crashes as a whole. The late night and early morning hours see more crashes than the daylight hours.
incidents %>%
filter(driver_substance_abuse_alcohol_present | driver_substance_abuse_alcohol_contributed) %>%
select(incident_hour, incident_minute) %>%
mutate(incident_time = 60 * incident_hour + incident_minute) %>%
ggplot() +
stat_count(aes(x = incident_time), fill = single_color) +
scale_x_continuous(
name = "Time",
breaks = seq.int(from = 0, to = 24 * 60, by = 120),
labels = paste0(seq.int(from = 0, to = 24, by = 2), ":00")
) +
ggtitle("Crashes Involving Alcohol by Time of Day") +
basic_theme
Weekends see more alchol related crashes than weekdays, the opposite of the trend for all crashes.
incidents %>%
filter(driver_substance_abuse_alcohol_present | driver_substance_abuse_alcohol_contributed) %>%
select(incident_weekday) %>%
ggplot() +
stat_count(aes(x = incident_weekday), fill = single_color) +
scale_x_continuous(
name = "Day of Week",
breaks = 1:7,
labels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thurday", "Friday", "Saturday")
) +
ggtitle("Crashes Involving Alcohol by Day of Week") +
basic_theme
Overview of the drivers table structure.
skimr::skim_to_wide(drivers) %>% DT::datatable()
## Warning: 'skimr::skim_to_wide' is deprecated.
## Use 'skim()' instead.
## See help("Deprecated")
81.33% of drivers did not sustain any reported injuries. 0.91% of Drivers suffered serious or fatal injuries.
ordered_levels <- drivers %>%
count(injury_severity) %>%
arrange(desc(n)) %>%
pull(injury_severity)
drivers %>%
mutate(`Injury Severity` = factor(injury_severity, ordered_levels)) %>%
ggplot() +
stat_count(aes(x = `Injury Severity`), fill = single_color) +
ggtitle("Severity of Driver Injuries") +
basic_theme
drivers %>%
count(injury_severity) %>%
arrange(n) %>%
knitr::kable()
| injury_severity | n |
|---|---|
| FATAL INJURY | 71 |
| SUSPECTED SERIOUS INJURY | 876 |
| SUSPECTED MINOR INJURY | 7763 |
| POSSIBLE INJURY | 10647 |
| NO APPARENT INJURY | 84334 |
Most reported accidents involved at least some vehicle damage. Just 3.73% of drivers reported no vehicle damage.
ordered_levels <- drivers %>%
count(vehicle_damage_extent) %>%
arrange(desc(n)) %>%
pull(vehicle_damage_extent)
drivers %>%
mutate(`Vehicle Damage` = factor(vehicle_damage_extent, ordered_levels)) %>%
ggplot() +
stat_count(aes(x = `Vehicle Damage`), fill = single_color) +
ggtitle("Severity of Vehicle Damage") +
basic_theme
drivers %>%
count(vehicle_damage_extent) %>%
arrange(desc(n)) %>%
knitr::kable()
| vehicle_damage_extent | n |
|---|---|
| DISABLING | 36731 |
| FUNCTIONAL | 28010 |
| SUPERFICIAL | 27217 |
| DESTROYED | 4081 |
| NO DAMAGE | 3874 |
| UNKNOWN | 3531 |
| N/A | 189 |
| OTHER | 58 |
drivers %>%
ggplot() +
stat_count(aes(x = speed_limit, fill = ), fill = single_color) +
ggtitle("Speed Limits") +
basic_theme
ordered_injury <- drivers %>%
count(injury_severity) %>%
arrange(desc(n)) %>%
pull(injury_severity)
drivers %>%
mutate(
`Speed Limit` = factor(c("LOW", "MEDIUM", "HIGH")[1 + findInterval(speed_limit, c(31, 46))], levels = c("LOW", "MEDIUM", "HIGH")),
`Injury Severity` = factor(injury_severity, ordered_injury)
) %>%
ggplot() +
stat_count(aes(x = `Injury Severity`, fill = `Speed Limit`)) +
basic_theme +
theme(legend.position = c(.9, .9)) +
ggtitle("Injuries and Speed Limits")
ordered_injury <- drivers %>%
count(injury_severity) %>%
arrange(desc(n)) %>%
pull(injury_severity)
drivers %>%
mutate(
speed_limit = factor(c("LOW", "MEDIUM", "HIGH")[1 + findInterval(speed_limit, c(31, 46))], levels = c("LOW", "MEDIUM", "HIGH")),
injury_severity = factor(injury_severity, ordered_injury)
) %>%
group_by(speed_limit, injury_severity) %>%
tally() %>%
rename(`Speed Limit` = speed_limit, `Injury Severity` = injury_severity) %>%
ggplot() +
geom_bar(aes(x = `Injury Severity`, weight = n, fill = `Speed Limit`), position = "fill") +
basic_theme +
theme(legend.position = "top") +
ggtitle("Injuries and Speed Limits") +
ylab("Proportion")
ordered_levels <- drivers %>%
count(vehicle_body_type) %>%
arrange(desc(n)) %>%
pull(vehicle_body_type)
drivers %>%
mutate(`Body Type` = factor(as.character(fct_collapse(vehicle_body_type, OTHER = ordered_levels[-(1:5)])), c(ordered_levels[1:5], "OTHER"))) %>%
ggplot() +
stat_count(aes(x = `Body Type`), fill = single_color) +
ggtitle("Vehicle Body Types") +
basic_theme
Overview of Non-Motorists table structure
skimr::skim_to_wide(non_motorists) %>% DT::datatable()
## Warning: 'skimr::skim_to_wide' is deprecated.
## Use 'skim()' instead.
## See help("Deprecated")
Pedestrians are the most common type of non motorist, followed by bicyclists.
ordered_levels <- non_motorists %>%
count(pedestrian_type) %>%
arrange(desc(n)) %>%
pull(pedestrian_type)
non_motorists %>%
mutate(pedestrian_type = factor(pedestrian_type, ordered_levels)) %>%
ggplot() +
stat_count(aes(x = pedestrian_type), fill = single_color) +
basic_theme +
ggtitle("Types of Non-Motorists")
pal <- colorFactor(c("blue", "red"), domain = c(0, 1))
non_motorists %>%
leaflet() %>%
addTiles() %>%
addCircleMarkers(lng = ~longitude, lat = ~latitude, radius = 3, stroke = TRUE, weight = 2, opacity = 1, color = ~ pal(not_in_county))
8.67% of non-motorists did not sustain any reported injuries. 13.12% of non-motorists suffered serious or fatal injuries.
ordered_levels <- non_motorists %>%
count(injury_severity) %>%
arrange(desc(n)) %>%
pull(injury_severity)
non_motorists %>%
mutate(`Injury Severity` = factor(injury_severity, ordered_levels)) %>%
ggplot() +
stat_count(aes(x = `Injury Severity`), fill = single_color) +
ggtitle("Severity of Non Motorist Injuries") +
basic_theme
non_motorists %>%
count(injury_severity) %>%
arrange(n) %>%
knitr::kable()
| injury_severity | n |
|---|---|
| FATAL INJURY | 64 |
| NO APPARENT INJURY | 285 |
| SUSPECTED SERIOUS INJURY | 367 |
| POSSIBLE INJURY | 1082 |
| SUSPECTED MINOR INJURY | 1486 |
ordered_levels <- non_motorists %>%
filter(pedestrian_type == "BICYCLIST") %>%
count(safety_equipment) %>%
arrange(desc(n)) %>%
pull(safety_equipment)
non_motorists %>%
filter(pedestrian_type == "BICYCLIST") %>%
mutate(`Safety Equipment` = factor(safety_equipment, ordered_levels)) %>%
ggplot() +
stat_count(aes(x = `Safety Equipment`), fill = single_color) +
basic_theme +
ggtitle("What Kinds of Safety Equiptment Do Bicyclists Use?")
The more severe injuries tend to have lower rates of helmet usage than minor injuries.
ordered_safety <- non_motorists %>%
filter(pedestrian_type == "BICYCLIST") %>%
count(safety_equipment) %>%
arrange(desc(n)) %>%
pull(safety_equipment)
ordered_injury <- non_motorists %>%
filter(pedestrian_type == "BICYCLIST") %>%
count(injury_severity) %>%
arrange(desc(n)) %>%
pull(injury_severity)
non_motorists %>%
filter(pedestrian_type == "BICYCLIST") %>%
mutate(
`Safety Equipment` = fct_collapse(safety_equipment, OTHER = ordered_safety[-(1:2)]),
`Injury Severity` = factor(injury_severity, ordered_injury)
) %>%
ggplot() +
stat_count(aes(x = `Injury Severity`, fill = `Safety Equipment`)) +
basic_theme +
theme(legend.position = c(.9, .9)) +
ggtitle("Injuries and Helmets")
fatal_drivers <- drivers %>%
filter(injury_severity_fatal_injury == 1) %>%
pull(report_number)
fatal_nonm <- non_motorists %>%
filter(injury_severity_fatal_injury == 1) %>%
pull(report_number)
fatal_acrs <- incidents %>%
filter(acrs_report_type_fatal_crash == 1) %>%
pull(report_number)
recorded_fatal <- c(fatal_drivers, fatal_nonm)
# vector of fatal reports with no recorded fatality
probs <- fatal_acrs %>% .[!. %in% recorded_fatal]
incidents %>%
filter(report_number %in% probs) %>%
DT::datatable(options = list(pageLength = 5))
drivers %>%
filter(report_number %in% probs) %>%
DT::datatable(options = list(pageLength = 5))
non_motorists %>%
filter(report_number %in% probs) %>%
DT::datatable(options = list(pageLength = 5))
drivers %>%
group_by(report_number) %>%
tally() %>%
arrange(desc(n))
## # A tibble: 57,742 x 2
## report_number n
## <chr> <int>
## 1 MCP12130045 9
## 2 MCP2667000H 8
## 3 MCP1227000M 7
## 4 MCP15800085 7
## 5 MCP23580027 7
## 6 MCP2513001C 7
## 7 MCP2617006L 7
## 8 MCP2871006C 7
## 9 MCP3140000Y 7
## 10 MCP9130001S 7
## # … with 57,732 more rows
drivers %>%
count(report_number) %>%
count(n) %>%
rename(`Vehicles Involved` = n) %>%
knitr::kable()
| Vehicles Involved | nn |
|---|---|
| 1 | 17909 |
| 2 | 34766 |
| 3 | 4207 |
| 4 | 713 |
| 5 | 119 |
| 6 | 17 |
| 7 | 9 |
| 8 | 1 |
| 9 | 1 |
non_motorists %>%
group_by(report_number) %>%
tally() %>%
arrange(desc(n))
## # A tibble: 3,122 x 2
## report_number n
## <chr> <int>
## 1 MCP229800LT 4
## 2 MCP2546002K 4
## 3 DD5603004V 3
## 4 DM8445001H 3
## 5 DM8457000R 3
## 6 DM8463000L 3
## 7 EJ7809000W 3
## 8 EJ7833003W 3
## 9 MCP1438002S 3
## 10 MCP20080044 3
## # … with 3,112 more rows
non_motorists %>%
count(report_number) %>%
count(n) %>%
rename(`non_motorists Involved` = n) %>%
knitr::kable()
| non_motorists Involved | nn |
|---|---|
| 1 | 2985 |
| 2 | 114 |
| 3 | 21 |
| 4 | 2 |
fatal_rate <- drivers %>%
select(report_number, speed_limit) %>%
group_by(report_number) %>%
summarise(speed_limit = max(speed_limit)) %>%
full_join(incidents) %>%
group_by(speed_limit) %>%
summarise(fatal_prop = mean(acrs_report_type_fatal_crash), count = n(), fatal_crashes = sum(acrs_report_type_fatal_crash))
## Joining, by = "report_number"
fatal_rate %>%
filter(!is.na(speed_limit)) %>%
ggplot() + geom_line(aes(x = speed_limit, y = fatal_prop))
temp <- fatal_rate %>% filter(speed_limit > 25)
prop.test(temp$fatal_crashes, temp$count)
## Warning in prop.test(temp$fatal_crashes, temp$count): Chi-squared approximation
## may be incorrect
##
## 9-sample test for equality of proportions without continuity
## correction
##
## data: temp$fatal_crashes out of temp$count
## X-squared = 53.003, df = 8, p-value = 1.077e-08
## alternative hypothesis: two.sided
## sample estimates:
## prop 1 prop 2 prop 3 prop 4 prop 5 prop 6
## 0.002949474 0.001816836 0.004212744 0.003819527 0.011564212 0.000000000
## prop 7 prop 8 prop 9
## 0.000000000 0.000000000 0.000000000
temp <- fatal_rate %>%
filter(speed_limit > 25) %>%
filter(speed_limit != 50)
prop.test(temp$fatal_crashes, temp$count)
## Warning in prop.test(temp$fatal_crashes, temp$count): Chi-squared approximation
## may be incorrect
##
## 8-sample test for equality of proportions without continuity
## correction
##
## data: temp$fatal_crashes out of temp$count
## X-squared = 17.974, df = 7, p-value = 0.01209
## alternative hypothesis: two.sided
## sample estimates:
## prop 1 prop 2 prop 3 prop 4 prop 5 prop 6
## 0.002949474 0.001816836 0.004212744 0.003819527 0.000000000 0.000000000
## prop 7 prop 8
## 0.000000000 0.000000000